1 Overview

This report analyzes kinship likelihood ratio (LR) simulations comparing two tested hypotheses:

  1. Parent-Child Hypothesis: LR for testing if pairs are parent-child vs unrelated
  2. Full-Siblings Hypothesis: LR for testing if pairs are full siblings vs unrelated

For each hypothesis, we examine performance across:

  • True positives: Correctly identifying the target relationship
  • Related false positives: Other relatives incorrectly exceeding threshold (e.g., half-siblings misidentified as full siblings)
  • Unrelated false positives: Unrelated pairs incorrectly exceeding threshold

2 Data Loading and Preprocessing

# Generate toy simulation data for testing the analysis pipeline
set.seed(42)

# Define factor levels
population_order <- c("all", "AfAm", "Asian", "Cauc", "Hispanic")
relationship_order <- c("parent_child", "full_siblings", "half_siblings", 
                        "cousins", "second_cousins", "unrelated")
loci_set_order <- c("core_13", "identifiler_15", "expanded_20", 
                    "supplementary", "autosomal_29")

# ONLY these two hypotheses are tested
tested_hypotheses <- c("parent_child", "full_siblings")

# Number of loci in each set
loci_counts <- c(
  "core_13" = 13,
  "identifiler_15" = 15,
  "expanded_20" = 20,
  "supplementary" = 23,
  "autosomal_29" = 29
)

# Mean per-locus log10(LR) matrix for the TWO tested hypotheses only
mean_log10_lr_matrix <- list(
  # Testing PARENT-CHILD hypothesis
  parent_child = c(
    "parent_child" = 0.40,      # True positive
    "full_siblings" = 0.10,     # Related FP (siblings look somewhat like PC)
    "half_siblings" = -0.05,    # Related FP
    "cousins" = -0.25,          # Related FP  
    "second_cousins" = -0.30,   # Related FP
    "unrelated" = -0.35         # Unrelated FP
  ),
  # Testing FULL-SIBLING hypothesis
  full_siblings = c(
    "parent_child" = 0.35,      # Related FP (PC looks like FS)
    "full_siblings" = 0.25,     # True positive
    "half_siblings" = 0.063,    # Related FP - POSITIVE! (key finding)
    "cousins" = -0.232,         # Related FP
    "second_cousins" = -0.283,  # Related FP
    "unrelated" = -0.340        # Unrelated FP
  )
)

# SD of per-locus log10(LR)
sd_log10_lr_per_locus <- 0.20

# Population effects (slight variation by ancestry)
pop_effects <- c(
  "all" = 0,
  "AfAm" = 0.015,
  "Asian" = -0.012,
  "Cauc" = 0.008,
  "Hispanic" = -0.008
)

n_pairs <- params$n_pairs_per_group

# Generate data - ONLY for parent_child and full_siblings hypotheses
generate_toy_data <- function() {
  all_data <- list()
  
  for (pop in population_order) {
    for (known_rel in relationship_order) {
      for (tested_rel in tested_hypotheses) {  # Only PC and FS hypotheses
        for (loci_set in loci_set_order) {
          n_loci <- loci_counts[loci_set]
          
          # Get mean per-locus LR for this known/tested combination
          mean_per_locus <- mean_log10_lr_matrix[[tested_rel]][known_rel]
          sd_per_locus <- sd_log10_lr_per_locus
          
          # Add population effect
          pop_effect <- pop_effects[pop]
          
          # Combined LR is sum of per-locus log10(LR)
          mean_combined <- n_loci * (mean_per_locus + pop_effect)
          sd_combined <- sqrt(n_loci) * sd_per_locus
          
          log10_lr <- rnorm(n_pairs, mean = mean_combined, sd = sd_combined)
          combined_lr <- 10^log10_lr
          
          group_data <- data.table(
            batch_id = 1,
            pair_id = 1:n_pairs,
            population = pop,
            known_relationship = known_rel,
            tested_relationship = tested_rel,
            tested_population = pop,
            loci_set = loci_set,
            combined_LR = combined_lr,
            is_correct_pop = TRUE
          )
          
          all_data[[length(all_data) + 1]] <- group_data
        }
      }
    }
  }
  
  rbindlist(all_data)
}

if (params$use_toy_data) {
  cat("Generating toy simulation data...\n")
  all_combined <- generate_toy_data()
  cat("Generated", nrow(all_combined), "rows of toy data\n")
} else {
  cat("Loading data from:", params$input_file, "\n")
  all_combined <- readRDS(params$input_file)
  setDT(all_combined)
}
## Generating toy simulation data...
## Generated 150000 rows of toy data
cat("Total rows:", nrow(all_combined), "\n")
## Total rows: 150000
cat("Unique pairs per group:", params$n_pairs_per_group, "\n")
## Unique pairs per group: 500
cat("Tested hypotheses:", paste(tested_hypotheses, collapse = ", "), "\n")
## Tested hypotheses: parent_child, full_siblings
relationship_labels <- c(
  "parent_child" = "Parent-Child",
  "full_siblings" = "Full Siblings", 
  "half_siblings" = "Half Siblings",
  "cousins" = "Cousins",
  "second_cousins" = "Second Cousins",
  "unrelated" = "Unrelated"
)

loci_set_labels <- c(
  "core_13" = "Core 13",
  "identifiler_15" = "Identifiler 15",
  "expanded_20" = "Expanded 20",
  "supplementary" = "Supplementary 23",
  "autosomal_29" = "Autosomal 29"
)

population_labels <- c(
  "all" = "All Populations",
  "AfAm" = "African American",
  "Asian" = "Asian",
  "Cauc" = "Caucasian",
  "Hispanic" = "Hispanic"
)

all_combined[, `:=`(
  population = factor(population, levels = population_order),
  tested_population = factor(tested_population, levels = population_order),
  known_relationship = factor(known_relationship, levels = relationship_order),
  tested_relationship = factor(tested_relationship, levels = tested_hypotheses),
  loci_set = factor(loci_set, levels = loci_set_order)
)]

# Add log10_LR column
all_combined[, log10_LR := log10(pmax(combined_LR, 1e-20))]

population_colors <- c(
  "all" = "#FF7F00",
  "AfAm" = "#E41A1C",
  "Asian" = "#377EB8",
  "Cauc" = "#4DAF4A",
  "Hispanic" = "#984EA3"
)

relationship_colors <- c(
  "parent_child" = "#1b9e77",
  "full_siblings" = "#d95f02",
  "half_siblings" = "#7570b3",
  "cousins" = "#e7298a",
  "second_cousins" = "#66a61e",
  "unrelated" = "#666666"
)

2.1 Data Summary

# Show counts by population and known relationship
data_summary <- all_combined[, .(
  n_pairs = uniqueN(paste(batch_id, pair_id)),
  n_tested_hypotheses = uniqueN(tested_relationship),
  n_loci_sets = uniqueN(loci_set)
), by = .(population, known_relationship)]

data_summary_wide <- dcast(data_summary, known_relationship ~ population, 
                           value.var = "n_pairs", fill = 0)

kable(data_summary_wide, 
      caption = "Number of simulated pairs by population and known relationship",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Number of simulated pairs by population and known relationship
known_relationship all AfAm Asian Cauc Hispanic
parent_child 500 500 500 500 500
full_siblings 500 500 500 500 500
half_siblings 500 500 500 500 500
cousins 500 500 500 500 500
second_cousins 500 500 500 500 500
unrelated 500 500 500 500 500
cat("\nEach cell shows", params$n_pairs_per_group, "pairs tested under", 
    length(tested_hypotheses), "hypotheses (Parent-Child, Full-Siblings) across", 
    length(loci_set_order), "loci sets.\n")
## 
## Each cell shows 500 pairs tested under 2 hypotheses (Parent-Child, Full-Siblings) across 5 loci sets.

3 Combined LR Distributions

3.1 Parent-Child Hypothesis

Testing: Are these pairs parent-child vs unrelated?

3.1.1 All Populations Combined

plot_lr_by_hypothesis <- function(data, tested_hyp, pop_filter = NULL) {
  subset_data <- data[tested_relationship == tested_hyp]
  if (!is.null(pop_filter)) {
    subset_data <- subset_data[population == pop_filter]
  }
  
  title_pop <- if (is.null(pop_filter)) "All Populations" else population_labels[pop_filter]
  
  ggplot(subset_data, aes(x = loci_set, y = log10_LR, fill = known_relationship)) +
    geom_boxplot(outlier.size = 0.3, outlier.alpha = 0.2) +
    facet_wrap(~known_relationship, scales = "free_y", ncol = 3,
               labeller = labeller(known_relationship = relationship_labels)) +
    scale_fill_manual(values = relationship_colors, guide = "none") +
    scale_x_discrete(labels = loci_set_labels) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red", alpha = 0.7) +
    labs(
      title = paste("log10(Combined LR) - Testing", relationship_labels[tested_hyp], "Hypothesis"),
      subtitle = paste(title_pop, "| Red line = LR=1 (no evidence either way)"),
      x = "Loci Set",
      y = "log10(Combined LR)"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))
}

plot_lr_by_hypothesis(all_combined, "parent_child", "all")

3.1.2 By Population

subset_pc <- all_combined[tested_relationship == "parent_child"]

ggplot(subset_pc, aes(x = loci_set, y = log10_LR, fill = known_relationship)) +
  geom_boxplot(outlier.size = 0.3, outlier.alpha = 0.2) +
  facet_grid(population ~ known_relationship, scales = "free_y",
             labeller = labeller(population = population_labels,
                                 known_relationship = relationship_labels)) +
  scale_fill_manual(values = relationship_colors, guide = "none") +
  scale_x_discrete(labels = loci_set_labels) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red", alpha = 0.5) +
  labs(
    title = "log10(Combined LR) - Testing Parent-Child Hypothesis",
    subtitle = "Rows = Population, Columns = True Relationship. Red line = LR=1",
    x = "Loci Set",
    y = "log10(Combined LR)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
        strip.text = element_text(size = 8))

3.2 Full-Siblings Hypothesis

Testing: Are these pairs full siblings vs unrelated?

3.2.1 All Populations Combined

plot_lr_by_hypothesis(all_combined, "full_siblings", "all")

3.2.2 By Population

subset_fs <- all_combined[tested_relationship == "full_siblings"]

ggplot(subset_fs, aes(x = loci_set, y = log10_LR, fill = known_relationship)) +
  geom_boxplot(outlier.size = 0.3, outlier.alpha = 0.2) +
  facet_grid(population ~ known_relationship, scales = "free_y",
             labeller = labeller(population = population_labels,
                                 known_relationship = relationship_labels)) +
  scale_fill_manual(values = relationship_colors, guide = "none") +
  scale_x_discrete(labels = loci_set_labels) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red", alpha = 0.5) +
  labs(
    title = "log10(Combined LR) - Testing Full-Siblings Hypothesis",
    subtitle = "Rows = Population, Columns = True Relationship. Red line = LR=1",
    x = "Loci Set",
    y = "log10(Combined LR)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
        strip.text = element_text(size = 8))

3.3 Mean log10(LR) Summary

# Calculate mean log10(LR) for each combination at 29 loci
lr_means <- all_combined[loci_set == "autosomal_29", .(
  mean_log10_LR = mean(log10_LR),
  sd_log10_LR = sd(log10_LR),
  n = .N
), by = .(population, known_relationship, tested_relationship)]

# Plot: facet by tested_relationship and population
ggplot(lr_means, aes(x = known_relationship, y = mean_log10_LR, fill = known_relationship)) +
  geom_col() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  facet_grid(tested_relationship ~ population, 
             labeller = labeller(population = population_labels,
                                 tested_relationship = relationship_labels)) +
  scale_fill_manual(values = relationship_colors, guide = "none") +
  scale_x_discrete(labels = function(x) str_wrap(relationship_labels[x], width = 8)) +
  labs(
    title = "Mean log10(Combined LR) at 29 Autosomal Loci",
    subtitle = "Rows = Tested Hypothesis, Columns = Population. Red line = LR=1.",
    x = "True (Known) Relationship",
    y = "Mean log10(Combined LR)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
        strip.text = element_text(size = 9))

fwrite(lr_means, file.path(params$output_dir, "mean_log10_LR_summary.csv"))

How to interpret:

  • Bars above red line: mean LR > 1, evidence FOR the tested hypothesis
  • Bars below red line: mean LR < 1, evidence AGAINST the tested hypothesis
  • True positives should have tall positive bars (e.g., full_siblings under Full-Siblings hypothesis)
  • Related false positives may have positive bars (e.g., half-siblings under Full-Siblings hypothesis!)

4 Proportions Exceeding Thresholds

4.1 Calculate Proportions

fixed_thresholds <- params$fixed_thresholds

# Calculate proportions for all combinations
proportions_all <- all_combined[, {
  props <- sapply(fixed_thresholds, function(t) mean(combined_LR > t, na.rm = TRUE))
  names(props) <- paste0("prop_LR_gt_", fixed_thresholds)
  as.list(c(n = .N, props))
}, by = .(population, known_relationship, tested_relationship, loci_set)]

proportions_all[, n_loci := loci_counts[as.character(loci_set)]]

# Add classification column
proportions_all[, classification := fcase(
  as.character(known_relationship) == as.character(tested_relationship), "True Positive",
  known_relationship == "unrelated", "Unrelated FP",
  default = "Related FP"
)]

fwrite(proportions_all, file.path(params$output_dir, "proportions_all_combinations.csv"))

4.2 Parent-Child Hypothesis: Proportion with LR > 1

plot_proportions <- function(data, tested_hyp, threshold_col = "prop_LR_gt_1") {
  subset_data <- data[tested_relationship == tested_hyp]
  
  ggplot(subset_data, aes(x = loci_set, y = .data[[threshold_col]], 
                          color = known_relationship, group = known_relationship)) +
    geom_line(linewidth = 1) +
    geom_point(size = 2) +
    facet_wrap(~population, ncol = 5, labeller = labeller(population = population_labels)) +
    scale_color_manual(values = relationship_colors, labels = relationship_labels) +
    scale_x_discrete(labels = loci_set_labels) +
    scale_y_continuous(labels = percent_format(), limits = c(0, 1)) +
    labs(
      title = paste("Proportion with LR > 1 - Testing", relationship_labels[tested_hyp], "Hypothesis"),
      subtitle = "By population and true relationship",
      x = "Loci Set",
      y = "Proportion Exceeding LR > 1",
      color = "True Relationship"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
          legend.position = "bottom")
}

plot_proportions(proportions_all, "parent_child")

4.3 Full-Siblings Hypothesis: Proportion with LR > 1

plot_proportions(proportions_all, "full_siblings")

Key observation: Note whether the half-siblings line is INCREASING or DECREASING with more loci. An increasing trend means more loci → more false identifications!

4.4 Comparison: Parent-Child vs Full-Siblings Hypothesis

compare_data <- proportions_all[loci_set == "autosomal_29"]

ggplot(compare_data, aes(x = known_relationship, y = prop_LR_gt_1, 
                         fill = tested_relationship)) +
  geom_col(position = "dodge") +
  facet_wrap(~population, ncol = 5, labeller = labeller(population = population_labels)) +
  scale_fill_manual(values = c("parent_child" = "#1b9e77", "full_siblings" = "#d95f02"),
                    labels = relationship_labels) +
  scale_x_discrete(labels = function(x) str_wrap(relationship_labels[x], width = 10)) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title = "Comparison: Parent-Child vs Full-Siblings Hypothesis (29 Loci)",
    subtitle = "Which hypothesis better discriminates each true relationship?",
    x = "True Relationship",
    y = "Proportion with LR > 1",
    fill = "Tested Hypothesis"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "bottom")

5 Statistical Tests

5.1 Test 1: Effect of Loci Set (Chi-square) - By Population

What this tests: Does the proportion exceeding LR > 1 differ significantly across loci sets?

Interpretation:

  • p < 0.05 + INCREASING trend: More loci increases the rate (good for TP, bad for FP)
  • p < 0.05 + DECREASING trend: More loci decreases the rate (bad for TP, good for FP)
  • p >= 0.05: No significant effect of number of loci
test_loci_effect <- function(data, pop, known_rel, tested_rel, threshold_col = "prop_LR_gt_1") {
  subset <- data[population == pop & known_relationship == known_rel & tested_relationship == tested_rel]
  
  if (nrow(subset) < 2) return(NULL)
  
  n_exceed <- round(subset[[threshold_col]] * subset$n)
  n_not_exceed <- subset$n - n_exceed
  
  if (any(n_exceed < 0) || any(n_not_exceed < 0)) return(NULL)
  if (sum(n_exceed) == 0 || sum(n_not_exceed) == 0) return(NULL)
  
  contingency <- matrix(c(n_exceed, n_not_exceed), nrow = 2, byrow = TRUE,
                       dimnames = list(c("Exceed", "Not_Exceed"), as.character(subset$loci_set)))
  
  test_result <- tryCatch(
    chisq.test(contingency, simulate.p.value = TRUE, B = 2000),
    error = function(e) NULL
  )
  
  if (is.null(test_result)) return(NULL)
  
  data.frame(
    population = pop,
    known_relationship = known_rel,
    tested_relationship = tested_rel,
    statistic = test_result$statistic,
    p_value = test_result$p.value,
    significant = test_result$p.value < 0.05
  )
}

# Run for all combinations
loci_tests <- rbindlist(lapply(population_order, function(pop) {
  rbindlist(lapply(relationship_order, function(known_rel) {
    rbindlist(lapply(tested_hypotheses, function(tested_rel) {
      test_loci_effect(proportions_all, pop, known_rel, tested_rel)
    }))
  }))
}))

# Display for Full-Siblings hypothesis
kable(loci_tests[tested_relationship == "full_siblings", 
                 .(population, known_relationship, statistic, p_value, significant)],
      caption = "Chi-square: Loci effect on LR > 1 (Full-Siblings Hypothesis)",
      digits = c(0, 0, 2, 4, 0)) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Chi-square: Loci effect on LR > 1 (Full-Siblings Hypothesis)
population known_relationship statistic p_value significant
all half_siblings 32.48 5e-04 TRUE
AfAm half_siblings 31.54 5e-04 TRUE
Asian half_siblings 39.76 5e-04 TRUE
Cauc half_siblings 25.75 1e-03 TRUE
Hispanic half_siblings 24.63 5e-04 TRUE
# Display for Parent-Child hypothesis
kable(loci_tests[tested_relationship == "parent_child", 
                 .(population, known_relationship, statistic, p_value, significant)],
      caption = "Chi-square: Loci effect on LR > 1 (Parent-Child Hypothesis)",
      digits = c(0, 0, 2, 4, 0)) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Chi-square: Loci effect on LR > 1 (Parent-Child Hypothesis)
population known_relationship statistic p_value significant
all full_siblings 33.82 0.0005 TRUE
all half_siblings 24.72 0.0005 TRUE
AfAm full_siblings 42.46 0.0005 TRUE
AfAm half_siblings 31.21 0.0005 TRUE
Asian full_siblings 12.82 0.0115 TRUE
Asian half_siblings 24.37 0.0005 TRUE
Cauc full_siblings 26.36 0.0010 TRUE
Cauc half_siblings 20.72 0.0010 TRUE
Hispanic full_siblings 38.76 0.0005 TRUE
Hispanic half_siblings 31.54 0.0005 TRUE
fwrite(loci_tests, file.path(params$output_dir, "stat_test_loci_effect.csv"))

5.2 Test 2: Effect of Population (Chi-square) - At 29 Loci

What this tests: Does the proportion exceeding LR > 1 differ across populations?

Interpretation:

  • p < 0.05: Population significantly affects rates → allele frequency database matters
  • p >= 0.05: Consistent across populations
test_pop_effect <- function(data, known_rel, tested_rel, threshold_col = "prop_LR_gt_1") {
  subset <- data[known_relationship == known_rel & 
                 tested_relationship == tested_rel & 
                 loci_set == "autosomal_29"]
  
  if (nrow(subset) < 2) return(NULL)
  
  n_exceed <- round(subset[[threshold_col]] * subset$n)
  n_not_exceed <- subset$n - n_exceed
  
  if (any(n_exceed < 0) || any(n_not_exceed < 0)) return(NULL)
  if (sum(n_exceed) == 0 || sum(n_not_exceed) == 0) return(NULL)
  
  contingency <- matrix(c(n_exceed, n_not_exceed), nrow = 2, byrow = TRUE,
                       dimnames = list(c("Exceed", "Not_Exceed"), as.character(subset$population)))
  
  test_result <- tryCatch(
    chisq.test(contingency, simulate.p.value = TRUE, B = 2000),
    error = function(e) NULL
  )
  
  if (is.null(test_result)) return(NULL)
  
  data.frame(
    known_relationship = known_rel,
    tested_relationship = tested_rel,
    statistic = test_result$statistic,
    p_value = test_result$p.value,
    significant = test_result$p.value < 0.05
  )
}

pop_tests <- rbindlist(lapply(relationship_order, function(known_rel) {
  rbindlist(lapply(tested_hypotheses, function(tested_rel) {
    test_pop_effect(proportions_all, known_rel, tested_rel)
  }))
}))

kable(pop_tests,
      caption = "Chi-square: Population effect on LR > 1 (29 Loci)",
      digits = c(0, 0, 2, 4, 0)) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Chi-square: Population effect on LR > 1 (29 Loci)
known_relationship tested_relationship statistic p_value significant
full_siblings parent_child 4.91 0.3478 FALSE
half_siblings parent_child 39.33 0.0005 TRUE
half_siblings full_siblings 29.29 0.0005 TRUE
fwrite(pop_tests, file.path(params$output_dir, "stat_test_population_effect.csv"))

5.3 Test 3: Linear Trend Analysis - By Population

What this tests: Is there a significant linear trend in proportion as loci increase?

Interpretation:

Metric Meaning
slope > 0 Proportion INCREASES with more loci
slope < 0 Proportion DECREASES with more loci
p < 0.05 Trend is statistically significant
R² > 0.8 Very consistent linear relationship
trend_tests <- proportions_all[, {
  if (.N < 3) return(NULL)
  
  model <- tryCatch(lm(prop_LR_gt_1 ~ n_loci), error = function(e) NULL)
  if (is.null(model)) return(NULL)
  
  summary_model <- summary(model)
  coef_table <- coef(summary_model)
  
  if (nrow(coef_table) < 2) return(NULL)
  
  list(
    slope = coef_table[2, 1],
    slope_se = coef_table[2, 2],
    p_value = coef_table[2, 4],
    r_squared = summary_model$r.squared,
    trend = ifelse(coef_table[2, 1] > 0, "INCREASING", "DECREASING"),
    significant = coef_table[2, 4] < 0.05
  )
}, by = .(population, known_relationship, tested_relationship)]

# Display for full_siblings hypothesis
kable(trend_tests[tested_relationship == "full_siblings", 
                  .(population, known_relationship, slope, p_value, r_squared, trend, significant)],
      caption = "Linear trend: Proportion vs N loci (Full-Siblings Hypothesis)",
      digits = c(0, 0, 5, 4, 3, 0, 0)) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Linear trend: Proportion vs N loci (Full-Siblings Hypothesis)
population known_relationship slope p_value r_squared trend significant
all parent_child 0.00000 0.2735 0.488 INCREASING FALSE
all full_siblings 0.00000 0.2735 0.488 INCREASING FALSE
all half_siblings 0.00535 0.0143 0.898 INCREASING TRUE
all cousins 0.00000 NaN NaN DECREASING NA
all second_cousins 0.00000 NaN NaN DECREASING NA
all unrelated 0.00000 NaN NaN DECREASING NA
AfAm parent_child 0.00000 0.2735 0.488 INCREASING FALSE
AfAm full_siblings 0.00000 0.2735 0.488 INCREASING FALSE
AfAm half_siblings 0.00380 0.0294 0.837 INCREASING TRUE
AfAm cousins 0.00000 NaN NaN DECREASING NA
AfAm second_cousins 0.00000 NaN NaN DECREASING NA
AfAm unrelated 0.00000 NaN NaN DECREASING NA
Asian parent_child 0.00000 0.2735 0.488 INCREASING FALSE
Asian full_siblings 0.00000 0.2735 0.488 INCREASING FALSE
Asian half_siblings 0.00739 0.0098 0.920 INCREASING TRUE
Asian cousins 0.00000 NaN NaN DECREASING NA
Asian second_cousins 0.00000 NaN NaN DECREASING NA
Asian unrelated 0.00000 NaN NaN DECREASING NA
Cauc parent_child 0.00000 0.2735 0.488 INCREASING FALSE
Cauc full_siblings 0.00000 0.2735 0.488 INCREASING FALSE
Cauc half_siblings 0.00393 0.0180 0.882 INCREASING TRUE
Cauc cousins 0.00000 NaN NaN DECREASING NA
Cauc second_cousins 0.00000 NaN NaN DECREASING NA
Cauc unrelated 0.00000 NaN NaN DECREASING NA
Hispanic parent_child 0.00000 0.2735 0.488 INCREASING FALSE
Hispanic full_siblings 0.00000 0.2735 0.488 INCREASING FALSE
Hispanic half_siblings 0.00539 0.0054 0.946 INCREASING TRUE
Hispanic cousins 0.00000 NaN NaN DECREASING NA
Hispanic second_cousins 0.00000 NaN NaN DECREASING NA
Hispanic unrelated 0.00000 NaN NaN DECREASING NA
# Display for parent_child hypothesis
kable(trend_tests[tested_relationship == "parent_child", 
                  .(population, known_relationship, slope, p_value, r_squared, trend, significant)],
      caption = "Linear trend: Proportion vs N loci (Parent-Child Hypothesis)",
      digits = c(0, 0, 5, 4, 3, 0, 0)) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Linear trend: Proportion vs N loci (Parent-Child Hypothesis)
population known_relationship slope p_value r_squared trend significant
all parent_child 0.00000 0.2735 0.488 INCREASING FALSE
all full_siblings 0.00223 0.0582 0.749 INCREASING FALSE
all half_siblings -0.00517 0.0580 0.749 DECREASING FALSE
all cousins 0.00000 NaN NaN DECREASING NA
all second_cousins 0.00000 NaN NaN DECREASING NA
all unrelated 0.00000 NaN NaN DECREASING NA
AfAm parent_child 0.00000 0.2735 0.488 INCREASING FALSE
AfAm full_siblings 0.00167 0.1149 0.618 INCREASING FALSE
AfAm half_siblings -0.00788 0.0048 0.950 DECREASING TRUE
AfAm cousins 0.00000 NaN NaN DECREASING NA
AfAm second_cousins 0.00000 NaN NaN DECREASING NA
AfAm unrelated 0.00000 NaN NaN DECREASING NA
Asian parent_child 0.00000 0.2735 0.488 INCREASING FALSE
Asian full_siblings 0.00195 0.0052 0.947 INCREASING TRUE
Asian half_siblings -0.00393 0.0844 0.683 DECREASING FALSE
Asian cousins 0.00000 NaN NaN DECREASING NA
Asian second_cousins 0.00000 NaN NaN DECREASING NA
Asian unrelated 0.00000 NaN NaN DECREASING NA
Cauc parent_child 0.00000 0.2735 0.488 INCREASING FALSE
Cauc full_siblings 0.00185 0.0415 0.797 INCREASING TRUE
Cauc half_siblings -0.00568 0.0092 0.923 DECREASING TRUE
Cauc cousins 0.00000 NaN NaN DECREASING NA
Cauc second_cousins 0.00000 NaN NaN DECREASING NA
Cauc unrelated 0.00000 NaN NaN DECREASING NA
Hispanic parent_child 0.00000 0.2735 0.488 INCREASING FALSE
Hispanic full_siblings 0.00304 0.0306 0.833 INCREASING TRUE
Hispanic half_siblings -0.00596 0.0057 0.944 DECREASING TRUE
Hispanic cousins 0.00000 NaN NaN DECREASING NA
Hispanic second_cousins 0.00000 NaN NaN DECREASING NA
Hispanic unrelated 0.00000 NaN NaN DECREASING NA
fwrite(trend_tests, file.path(params$output_dir, "stat_test_trend.csv"))

Critical finding: If half_siblings under Full-Siblings hypothesis shows: - trend = “INCREASING” - significant = TRUE

This confirms that more loci increases the rate at which half-siblings are falsely identified as full siblings.

6 Classification Performance

6.1 Summary at 29 Loci

summary_29 <- proportions_all[loci_set == "autosomal_29"]

ggplot(summary_29, aes(x = known_relationship, y = prop_LR_gt_1, fill = classification)) +
  geom_col(position = "dodge") +
  facet_grid(tested_relationship ~ population,
             labeller = labeller(population = population_labels,
                                 tested_relationship = relationship_labels)) +
  scale_fill_manual(values = c("True Positive" = "#2ca02c", 
                               "Related FP" = "#ff7f0e", 
                               "Unrelated FP" = "#d62728")) +
  scale_x_discrete(labels = function(x) str_wrap(relationship_labels[x], width = 8)) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title = "Classification Performance at 29 Autosomal Loci",
    subtitle = "Rows = Tested Hypothesis, Columns = Population",
    x = "True Relationship",
    y = "Proportion with LR > 1",
    fill = "Classification"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
        legend.position = "bottom")

ggsave(file.path(params$output_dir, "classification_summary.png"), width = 14, height = 6, dpi = 300)

6.2 Detailed Rates Table

rates_table <- proportions_all[
  loci_set == "autosomal_29",
  .(population, known_relationship, tested_relationship, classification, 
    prop_LR_gt_1, prop_LR_gt_10, prop_LR_gt_100, prop_LR_gt_1000)
]

setorder(rates_table, tested_relationship, population, classification, known_relationship)

kable(rates_table,
      caption = "Detailed rates at 29 Autosomal Loci",
      digits = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), font_size = 10) %>%
  scroll_box(height = "500px")
Detailed rates at 29 Autosomal Loci
population known_relationship tested_relationship classification prop_LR_gt_1 prop_LR_gt_10 prop_LR_gt_100 prop_LR_gt_1000
all full_siblings parent_child Related FP 0.996 0.958 0.796 0.436
all half_siblings parent_child Related FP 0.104 0.020 0.000 0.000
all cousins parent_child Related FP 0.000 0.000 0.000 0.000
all second_cousins parent_child Related FP 0.000 0.000 0.000 0.000
all parent_child parent_child True Positive 1.000 1.000 1.000 1.000
all unrelated parent_child Unrelated FP 0.000 0.000 0.000 0.000
AfAm full_siblings parent_child Related FP 1.000 0.990 0.876 0.632
AfAm half_siblings parent_child Related FP 0.160 0.032 0.004 0.000
AfAm cousins parent_child Related FP 0.000 0.000 0.000 0.000
AfAm second_cousins parent_child Related FP 0.000 0.000 0.000 0.000
AfAm parent_child parent_child True Positive 1.000 1.000 1.000 1.000
AfAm unrelated parent_child Unrelated FP 0.000 0.000 0.000 0.000
Asian full_siblings parent_child Related FP 0.992 0.910 0.680 0.320
Asian half_siblings parent_child Related FP 0.062 0.002 0.000 0.000
Asian cousins parent_child Related FP 0.000 0.000 0.000 0.000
Asian second_cousins parent_child Related FP 0.000 0.000 0.000 0.000
Asian parent_child parent_child True Positive 1.000 1.000 1.000 1.000
Asian unrelated parent_child Unrelated FP 0.000 0.000 0.000 0.000
Cauc full_siblings parent_child Related FP 0.998 0.972 0.854 0.570
Cauc half_siblings parent_child Related FP 0.116 0.016 0.000 0.000
Cauc cousins parent_child Related FP 0.000 0.000 0.000 0.000
Cauc second_cousins parent_child Related FP 0.000 0.000 0.000 0.000
Cauc parent_child parent_child True Positive 1.000 1.000 1.000 1.000
Cauc unrelated parent_child Unrelated FP 0.000 0.000 0.000 0.000
Hispanic full_siblings parent_child Related FP 0.996 0.948 0.750 0.402
Hispanic half_siblings parent_child Related FP 0.058 0.004 0.002 0.000
Hispanic cousins parent_child Related FP 0.000 0.000 0.000 0.000
Hispanic second_cousins parent_child Related FP 0.000 0.000 0.000 0.000
Hispanic parent_child parent_child True Positive 1.000 1.000 1.000 1.000
Hispanic unrelated parent_child Unrelated FP 0.000 0.000 0.000 0.000
all parent_child full_siblings Related FP 1.000 1.000 1.000 1.000
all half_siblings full_siblings Related FP 0.968 0.802 0.428 0.142
all cousins full_siblings Related FP 0.000 0.000 0.000 0.000
all second_cousins full_siblings Related FP 0.000 0.000 0.000 0.000
all full_siblings full_siblings True Positive 1.000 1.000 1.000 1.000
all unrelated full_siblings Unrelated FP 0.000 0.000 0.000 0.000
AfAm parent_child full_siblings Related FP 1.000 1.000 1.000 1.000
AfAm half_siblings full_siblings Related FP 0.978 0.872 0.588 0.268
AfAm cousins full_siblings Related FP 0.000 0.000 0.000 0.000
AfAm second_cousins full_siblings Related FP 0.000 0.000 0.000 0.000
AfAm full_siblings full_siblings True Positive 1.000 1.000 1.000 1.000
AfAm unrelated full_siblings Unrelated FP 0.000 0.000 0.000 0.000
Asian parent_child full_siblings Related FP 1.000 1.000 1.000 1.000
Asian half_siblings full_siblings Related FP 0.916 0.690 0.352 0.086
Asian cousins full_siblings Related FP 0.000 0.000 0.000 0.000
Asian second_cousins full_siblings Related FP 0.000 0.000 0.000 0.000
Asian full_siblings full_siblings True Positive 1.000 1.000 1.000 1.000
Asian unrelated full_siblings Unrelated FP 0.000 0.000 0.000 0.000
Cauc parent_child full_siblings Related FP 1.000 1.000 1.000 1.000
Cauc half_siblings full_siblings Related FP 0.966 0.824 0.504 0.186
Cauc cousins full_siblings Related FP 0.000 0.000 0.000 0.000
Cauc second_cousins full_siblings Related FP 0.000 0.000 0.000 0.000
Cauc full_siblings full_siblings True Positive 1.000 1.000 1.000 1.000
Cauc unrelated full_siblings Unrelated FP 0.000 0.000 0.000 0.000
Hispanic parent_child full_siblings Related FP 1.000 1.000 1.000 1.000
Hispanic half_siblings full_siblings Related FP 0.938 0.706 0.360 0.096
Hispanic cousins full_siblings Related FP 0.000 0.000 0.000 0.000
Hispanic second_cousins full_siblings Related FP 0.000 0.000 0.000 0.000
Hispanic full_siblings full_siblings True Positive 1.000 1.000 1.000 1.000
Hispanic unrelated full_siblings Unrelated FP 0.000 0.000 0.000 0.000
fwrite(rates_table, file.path(params$output_dir, "detailed_rates_29loci.csv"))

7 Key Findings

cat("=== KEY FINDINGS ===\n\n")
## === KEY FINDINGS ===
cat("1. TRUE POSITIVE RATES (at 29 loci, LR > 1):\n")
## 1. TRUE POSITIVE RATES (at 29 loci, LR > 1):
tp_rates <- proportions_all[loci_set == "autosomal_29" & 
                            as.character(known_relationship) == as.character(tested_relationship),
                            .(population, tested_relationship, prop_LR_gt_1)]
print(dcast(tp_rates, tested_relationship ~ population, value.var = "prop_LR_gt_1"))
## Key: <tested_relationship>
##    tested_relationship   all  AfAm Asian  Cauc Hispanic
##                 <fctr> <num> <num> <num> <num>    <num>
## 1:        parent_child     1     1     1     1        1
## 2:       full_siblings     1     1     1     1        1
cat("\n2. UNRELATED FALSE POSITIVE RATES (at 29 loci, LR > 1):\n")
## 
## 2. UNRELATED FALSE POSITIVE RATES (at 29 loci, LR > 1):
unrel_fp <- proportions_all[loci_set == "autosomal_29" & 
                            known_relationship == "unrelated",
                            .(population, tested_relationship, prop_LR_gt_1)]
print(dcast(unrel_fp, tested_relationship ~ population, value.var = "prop_LR_gt_1"))
## Key: <tested_relationship>
##    tested_relationship   all  AfAm Asian  Cauc Hispanic
##                 <fctr> <num> <num> <num> <num>    <num>
## 1:        parent_child     0     0     0     0        0
## 2:       full_siblings     0     0     0     0        0
cat("\n3. HALF-SIBLING AS FULL-SIBLING FALSE ID (at 29 loci):\n")
## 
## 3. HALF-SIBLING AS FULL-SIBLING FALSE ID (at 29 loci):
hs_fp <- proportions_all[loci_set == "autosomal_29" & 
                         known_relationship == "half_siblings" &
                         tested_relationship == "full_siblings",
                         .(population, prop_LR_gt_1, prop_LR_gt_10, prop_LR_gt_100)]
print(hs_fp)
##    population prop_LR_gt_1 prop_LR_gt_10 prop_LR_gt_100
##        <fctr>        <num>         <num>          <num>
## 1:        all        0.968         0.802          0.428
## 2:       AfAm        0.978         0.872          0.588
## 3:      Asian        0.916         0.690          0.352
## 4:       Cauc        0.966         0.824          0.504
## 5:   Hispanic        0.938         0.706          0.360
cat("\n4. TREND DIRECTIONS (Full-Siblings hypothesis, half-siblings):\n")
## 
## 4. TREND DIRECTIONS (Full-Siblings hypothesis, half-siblings):
hs_trend <- trend_tests[tested_relationship == "full_siblings" & known_relationship == "half_siblings"]
print(hs_trend[, .(population, trend, slope, p_value, significant)])
##    population      trend       slope     p_value significant
##        <fctr>     <char>       <num>       <num>      <lgcl>
## 1:        all INCREASING 0.005353659 0.014302191        TRUE
## 2:       AfAm INCREASING 0.003804878 0.029444892        TRUE
## 3:      Asian INCREASING 0.007390244 0.009790329        TRUE
## 4:       Cauc INCREASING 0.003926829 0.017968074        TRUE
## 5:   Hispanic INCREASING 0.005390244 0.005364040        TRUE

8 Interpretation Summary

8.1 Quick Reference

What to Check Good Sign Concerning Sign
True Positive Rate High (>90%) and increasing with loci Low or decreasing
Unrelated FP Rate Low (<5%) and decreasing with loci High or increasing
Related FP Rate (e.g., half-sibs) Low and decreasing with loci High and INCREASING
Population effect Not significant Highly significant

8.2 Key Questions Answered

  1. Does adding more loci help?
    • For true positives: YES (should increase)
    • For unrelated FP: YES (should decrease)
    • For related FP like half-siblings: DEPENDS on the relationship
  2. Are results consistent across populations?
    • Check the population effect chi-square tests
    • Significant = ancestry matters for this relationship
  3. Which hypothesis is better for discrimination?
    • Compare the classification summary plot

9 Session Info

cat("Analysis completed:", format(Sys.time(), "%Y-%m-%d %H:%M:%S"), "\n")
## Analysis completed: 2025-12-12 14:09:38
cat("Output directory:", params$output_dir, "\n\n")
## Output directory: ../output/analysis_results
output_files <- list.files(params$output_dir, full.names = FALSE)
cat("Generated files:\n")
## Generated files:
cat(paste(" -", output_files), sep = "\n")
##  - classification_summary.png
##  - combined_LR_summary_by_loci.csv
##  - detailed_rates_29loci.csv
##  - false_identification_rates.png
##  - fixed_threshold_trends.png
##  - mean_log10_LR_summary.csv
##  - proportions_all_combinations.csv
##  - proportions_fixed_thresholds.csv
##  - stat_test_loci_effect_stratified.csv
##  - stat_test_loci_effect.csv
##  - stat_test_population_effect.csv
##  - stat_test_trend_analysis.csv
##  - stat_test_trend_stratified.csv
##  - stat_test_trend.csv
##  - summary_rates.csv
##  - tpr_vs_fpr.png
sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Detroit
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.4.0  knitr_1.50        scales_1.4.0      data.table_1.17.4
##  [5] lubridate_1.9.4   forcats_1.0.0     stringr_1.5.1     dplyr_1.1.4      
##  [9] purrr_1.0.4       readr_2.1.5       tidyr_1.3.1       tibble_3.2.1     
## [13] ggplot2_4.0.0     tidyverse_2.0.0  
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     xml2_1.3.8         stringi_1.8.7     
##  [5] hms_1.1.3          digest_0.6.37      magrittr_2.0.3     evaluate_1.0.3    
##  [9] grid_4.5.2         timechange_0.3.0   RColorBrewer_1.1-3 fastmap_1.2.0     
## [13] jsonlite_2.0.0     viridisLite_0.4.2  textshaping_1.0.1  jquerylib_0.1.4   
## [17] cli_3.6.5          rlang_1.1.6        withr_3.0.2        cachem_1.1.0      
## [21] yaml_2.3.10        tools_4.5.2        tzdb_0.5.0         vctrs_0.6.5       
## [25] R6_2.6.1           lifecycle_1.0.4    ragg_1.4.0         pkgconfig_2.0.3   
## [29] pillar_1.10.2      bslib_0.9.0        gtable_0.3.6       glue_1.8.0        
## [33] systemfonts_1.3.1  xfun_0.54          tidyselect_1.2.1   rstudioapi_0.17.1 
## [37] dichromat_2.0-0.1  farver_2.1.2       htmltools_0.5.8.1  rmarkdown_2.29    
## [41] svglite_2.2.1      labeling_0.4.3     compiler_4.5.2     S7_0.2.0